# coding:utf-8
# 博图列表页
from bs4 import BeautifulSoup
import requests
import os
import time
import math

# Outbound HTTP proxy used for every request; only the "http" scheme is proxied.
proxies = {
    "http": "192.168.0.71:8012",
}
# Browser-like request headers plus a fixed session cookie.
# NOTE(review): the hard-coded ASP.NET session id presumably expires at some
# point — confirm it is still valid before a long crawl run.
hdrs = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.108 Safari/537.36',
    'Cookie': 'ASP.NET_SessionId=5wck1vuidcqjp5at5habah45',
    'Host': '222.198.130.68',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', }

# Root directory for saved pages; down() buckets files into sub-folders of 1000 ids.
root = r"E:\down_data\博图\old_html"


def down(url, bookid):
    """Download the book page at *url* and save it under ``root``.

    The page is written to ``<root>/<bucket>/<bookid>.html`` where bucket is
    ``ceil(bookid / 1000)`` (ids 1-1000 -> "1", 1001-2000 -> "2", ...).

    Skips silently-with-a-message when:
      * the file already exists on disk,
      * the HTTP request fails (timeout / connection error),
      * the server returns its "404" error page (then backs off 30 minutes),
      * the HTML lacks the expected ``<b id="bookname">`` element.
    """
    bucket_dir = os.path.join(root, str(math.ceil(int(bookid) / 1000)))
    # exist_ok avoids the check-then-create race of the original exists() test.
    os.makedirs(bucket_dir, exist_ok=True)

    file_path = os.path.join(bucket_dir, str(bookid) + '.html')
    if os.path.exists(file_path):
        print("第" + str(bookid) + "条存在")
        return

    try:
        r = requests.get(url, proxies=proxies, headers=hdrs, timeout=10)
    except requests.exceptions.RequestException:
        # Narrowed from a bare except so Ctrl-C / SystemExit still propagate.
        print("timeout")
        return

    # The server answers with an error page on bad ids; detect it by content.
    if "404 - 找不到文件或目录。" in r.text:
        print(url)
        print("网络错误")
        time.sleep(1800)  # long back-off — presumably a sign of being blocked
        return

    # Sanity-check that this really is a book page before persisting it.
    soup = BeautifulSoup(r.content, 'lxml')
    if not soup.find('b', id="bookname"):
        print("html error")
        return

    with open(file_path, 'wb') as f:
        f.write(r.content)
    print(str(bookid) + '下载成功')


if __name__ == "__main__":
    # One book id per line. strip() removes surrounding whitespace so stray
    # spaces/tabs never end up in the request URL; 'with' guarantees the id
    # file is closed (the original iterated an open() that was never closed).
    with open(r"E:\down_data\博图\botu_id10.txt", 'r', encoding="utf-8") as id_file:
        for line in id_file:
            bookid = line.strip()
            if bookid:
                url = "http://222.198.130.68/BookRead.aspx?bookid=" + bookid
                down(url, bookid)
# for data in sql_data  :
# bookid = data[0]
# url = "http://222.198.130.68/BookRead.aspx?bookid="+str(bookid)
# down(url,bookid)
